import pandas as pd

# Load the 5,000-song Spotify export and tidy the header row.
file_path = r'D:\Bootcamp\Main\3_spotify_5000_songs.csv'
spotify5k_df = pd.read_csv(file_path)
# Strip stray whitespace from every column label.
spotify5k_df.columns = spotify5k_df.columns.str.strip()
spotify5k_df.dtypes
Unnamed: 0 int64 name object artist object danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 type object duration_ms int64 time_signature int64 id object html object dtype: object
# Show the DataFrame's axes: the row RangeIndex and the column Index.
spotify5k_df.axes
[RangeIndex(start=0, stop=5235, step=1),
Index(['Unnamed: 0', 'name', 'artist', 'danceability', 'energy', 'key',
'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
'liveness', 'valence', 'tempo', 'type', 'duration_ms', 'time_signature',
'id', 'html'],
dtype='object')]
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Elbow analysis on the raw (unscaled) audio features: fit KMeans for
# k = 1..10 and record the within-cluster sum of squares (inertia).
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = spotify5k_df[features]

wcss = []
k_range = range(1, 11)
for k in k_range:
    # n_init=10 pins the pre-sklearn-1.4 default explicitly, silencing the
    # FutureWarning this notebook's later output shows.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" suggests a reasonable cluster count.
plt.figure(figsize=(10, 6))
plt.plot(k_range, wcss, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Curve for KMeans Clustering')
plt.xticks(k_range)
plt.show()

# Tabulate the curve for inspection.
wcss_data = pd.DataFrame({'Number of Clusters (k)': k_range, 'WCSS': wcss})
print("WCSS Data:")
wcss_data
WCSS Data:
| Number of Clusters (k) | WCSS | |
|---|---|---|
| 0 | 1 | 4.481778e+06 |
| 1 | 2 | 1.744851e+06 |
| 2 | 3 | 8.550621e+05 |
| 3 | 4 | 5.981134e+05 |
| 4 | 5 | 4.404780e+05 |
| 5 | 6 | 3.650132e+05 |
| 6 | 7 | 3.213276e+05 |
| 7 | 8 | 2.811246e+05 |
| 8 | 9 | 2.543345e+05 |
| 9 | 10 | 2.315212e+05 |
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer
# Define the features
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
# Extract features from the dataframe (assuming spotify5k_df is defined elsewhere)
# X = spotify5k_df[features]
# Number of clusters
k = 4
# List of scaler names for plotting and labeling
scaler_names = ['Raw', 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'PowerTransformer']
# Dictionary to store Within-Cluster-Sum-of-Squares (WCSS) for each scaler
wcss_dict = {}
# Define colors for better readability
colors = sns.color_palette("tab10")
# Dictionary to store cluster centers for each scaler
cluster_centers = {}
# Iterate over each scaler (None = no scaling; relies on X from the earlier
# elbow cell still being in scope)
for i, scaler in enumerate([None, StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler(), PowerTransformer()]):
    # Scale the features if scaler is not None
    if scaler is not None:
        X_scaled = scaler.fit_transform(X)
        scaler_name = scaler_names[i]
    else:
        X_scaled = X
        scaler_name = 'Raw Data'  # NOTE: label differs from scaler_names[0] ('Raw')
    # Initialize KMeans clustering (n_init pinned explicitly to 10)
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    # Calculate WCSS
    wcss = kmeans.inertia_
    wcss_dict[scaler_name] = wcss
    # Store cluster centers
    cluster_centers[scaler_name] = kmeans.cluster_centers_
    # Plot radar chart for cluster centers (one spoke per feature)
    plt.figure(figsize=(10, 6))
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False).tolist()
    angles += angles[:1]  # Ensure closed loop
    ax = plt.subplot(111, polar=True)
    # Plot each cluster's centroid
    for idx, center in enumerate(kmeans.cluster_centers_):
        values = np.round(np.concatenate((center, [center[0]])), 2)  # Round to two decimal points
        ax.plot(angles, values, marker='o', linestyle='-', color=colors[idx], linewidth=2, label=f'Cluster {idx+1}')
        # Fill the area enclosed by each cluster's centroid with a light shade
        ax.fill(angles, values, color=colors[idx], alpha=0.25)
    # Set the labels for each axis
    ax.set_yticklabels([])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features, fontsize=10)
    plt.title(f'Radar Chart for Cluster Centers ({scaler_name})', loc='left', fontsize=12, pad=20)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=k)
    plt.show()
    # Create DataFrame to display the WCSS tallied so far (table grows each pass)
    scaler_df = pd.DataFrame(list(wcss_dict.items()), columns=['Scaler', 'WCSS'])
    # Display the table for the current scaler
    print(f"{scaler_name} chart")
    print(f"Table for {scaler_name}:")
    display(scaler_df)
# Find the best scaler based on the minimum WCSS
# NOTE(review): WCSS values are not directly comparable across scalers — each
# scaler changes the units/extent of the feature space, so the minimum mainly
# reflects whichever transform shrinks the data most, not "better" clusters.
best_scaler = min(wcss_dict, key=wcss_dict.get)
print(f"\nBased on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is {best_scaler}. Using this scaler helps in minimizing the WCSS, indicating better cluster formation.")
print(f"\nChoosing an appropriate scaler is crucial as it affects the clustering results. With {k} clusters, it is recommended to use the {best_scaler} scaler to create the clusters.")
# Add explanation points based on the WCSS score
print("\nAdditional points: \n1) A lower WCSS score indicates that the data points within each cluster are closer to their respective centroids, implying more compact and well-separated clusters. \n2) By selecting the scaler that yields the lowest WCSS, we aim to achieve the most meaningful and distinct cluster separation. \n3) The shaded area in each radar chart represents the coverage or extent of each cluster's features in the scaled feature space. It visually demonstrates how different clusters vary in their feature composition and distribution.")
Raw Data chart Table for Raw Data:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
StandardScaler chart Table for StandardScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
MinMaxScaler chart Table for MinMaxScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
RobustScaler chart Table for RobustScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
MaxAbsScaler chart Table for MaxAbsScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
| 4 | MaxAbsScaler | 999.866939 |
PowerTransformer chart Table for PowerTransformer:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
| 4 | MaxAbsScaler | 999.866939 |
| 5 | PowerTransformer | 24975.059530 |
Based on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is MinMaxScaler. Using this scaler helps in minimizing the WCSS, indicating better cluster formation. Choosing an appropriate scaler is crucial as it affects the clustering results. With 4 clusters, it is recommended to use the MinMaxScaler scaler to create the clusters. Additional points: 1) A lower WCSS score indicates that the data points within each cluster are closer to their respective centroids, implying more compact and well-separated clusters. 2) By selecting the scaler that yields the lowest WCSS, we aim to achieve the most meaningful and distinct cluster separation. 3) The shaded area in each radar chart represents the coverage or extent of each cluster's features in the scaled feature space. It visually demonstrates how different clusters vary in their feature composition and distribution.
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd

# Elbow analysis again, this time on standardized features so every audio
# feature contributes on a comparable scale.
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = spotify5k_df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

wcss = []
for k in range(1, 11):
    # n_init=10 pins the pre-sklearn-1.4 default and silences the FutureWarning.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

# Tabulated elbow values for inspection.
elbow_df = pd.DataFrame({'Number of Clusters': range(1, 11), 'WCSS': wcss})
elbow_df
# Silhouette analysis on the standardized features for k = 2..10
# (the silhouette score is undefined for a single cluster).
silhouette_scores = []
for k in range(2, 11):
    # n_init=10 pins the pre-sklearn-1.4 default and silences the FutureWarning.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal Number of Clusters')
plt.show()

# Tabulated silhouette values for inspection.
silhouette_df = pd.DataFrame({'Number of Clusters': range(2, 11), 'Silhouette Score': silhouette_scores})
silhouette_df
| Number of Clusters | Silhouette Score | |
|---|---|---|
| 0 | 2 | 0.333466 |
| 1 | 3 | 0.270886 |
| 2 | 4 | 0.276123 |
| 3 | 5 | 0.267038 |
| 4 | 6 | 0.247665 |
| 5 | 7 | 0.215942 |
| 6 | 8 | 0.213132 |
| 7 | 9 | 0.196396 |
| 8 | 10 | 0.187944 |
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Standardize, inspect how much variance each principal component explains,
# then cluster on a reduced 5-component projection.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_scaled)
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

plt.plot(range(1, len(explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='-')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio by Principal Components')
plt.grid(False)
plt.show()

# Re-fit PCA keeping 5 components, then run KMeans on the projection.
num_components = 5
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_scaled)

num_clusters = 4
# n_init=10 pins the pre-sklearn-1.4 default; the original run emitted a
# FutureWarning at exactly this call for leaving it implicit.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(X_pca)
labels = kmeans.labels_
spotify5k_df['Cluster'] = labels
cluster_counts = spotify5k_df['Cluster'].value_counts()
print("Cluster Counts:\n", cluster_counts)
C:\Users\Rustoo\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
Cluster Counts: Cluster 2 2602 3 1343 0 878 1 412 Name: count, dtype: int64
# Re-cluster on standardized features (k = 4) and profile each cluster by the
# mean of every numeric column.
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(spotify5k_df[features])
# n_init=10 pins the pre-sklearn-1.4 default and silences the FutureWarning.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
spotify5k_df['Cluster'] = kmeans.fit_predict(X)
# Profile clusters over every numeric column (not just the clustering features).
numeric_columns = spotify5k_df.select_dtypes(include=['float64', 'int64'])
cluster_means = spotify5k_df.groupby('Cluster')[numeric_columns.columns].mean()
print("Cluster Characteristics (Mean Feature Values):\n")
cluster_means
import matplotlib.pyplot as plt

# Scatter the first two standardized features, coloured by cluster label.
plt.scatter(X[:, 0], X[:, 1], c=spotify5k_df['Cluster'], cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Clusters of Songs')
plt.colorbar(label='Cluster')
plt.show()

# One histogram figure per audio feature, overlaying the four clusters.
for feat in features:
    plt.figure(figsize=(8, 6))
    for cid in range(4):
        subset = spotify5k_df.loc[spotify5k_df['Cluster'] == cid, feat]
        plt.hist(subset, bins=20, alpha=0.6, label=f'Cluster {cid}')
    plt.title(f'Distribution of {feat} by Cluster')
    plt.xlabel(feat)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()
Cluster Characteristics (Mean Feature Values):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sanity check: if the KMeans clusters are compact and separable, a 5-NN
# classifier trained on an 80/20 split should reproduce the cluster labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, spotify5k_df['Cluster'], test_size=0.2, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
Accuracy: 0.9684813753581661
# Re-inspect dtypes: the newly added 'Cluster' column now appears (int32).
spotify5k_df.dtypes
Unnamed: 0 int64 name object artist object danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 type object duration_ms int64 time_signature int64 id object html object Cluster int32 dtype: object
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Min-max normalize tempo into [0, 1].
# PERF FIX: the original recomputed the column min/max inside a per-row
# .apply call (O(n) work per row, O(n^2) total). Hoist them once and apply
# the formula vectorized over the whole Series.
min_tempo = spotify5k_df['tempo'].min()
max_tempo = spotify5k_df['tempo'].max()

def normalize_tempo(tempo):
    """Scale a tempo value (scalar or Series) into [0, 1] using the dataset's
    observed tempo range."""
    return (tempo - min_tempo) / (max_tempo - min_tempo)

# Vectorized application — identical values to the original per-row .apply.
spotify5k_df['normalized_tempo'] = normalize_tempo(spotify5k_df['tempo'])

cluster_names = {
    0: 'Serene Sounds',
    1: 'Pulsating Rhythms',
    2: 'Tranquil Tunes',
    3: 'Melancholic Melodies',
}

# Per-cluster means of the mood-related features ('normalized_tempo' replaces
# raw 'tempo' so it is comparable with the other 0-1 features).
cluster_statistics = spotify5k_df.groupby('Cluster').agg({
    'danceability': 'mean',
    'energy': 'mean',
    'valence': 'mean',
    'normalized_tempo': 'mean',
    'acousticness': 'mean',
    'speechiness': 'mean'
})

# NOTE(review): these hard-coded blurbs do not match the computed statistics —
# e.g. cluster 1 is described as "high energy" while its mean energy prints as
# ~0.15 (with acousticness ~0.91), and cluster 3 is called "high acousticness"
# at ~0.01. The interpolated numbers are correct; the prose should be
# re-derived from them.
cluster_explanations = {
    0: f"Songs with serene and calming vibes characterized by high valence ({cluster_statistics.loc[0, 'valence']:.2f}) and moderate tempo ({cluster_statistics.loc[0, 'normalized_tempo']:.2f}).",
    1: f"Tracks featuring pulsating rhythms and high energy suitable for dancing with high energy ({cluster_statistics.loc[1, 'energy']:.2f}) and tempo ({cluster_statistics.loc[1, 'normalized_tempo']:.2f}).",
    2: f"Music with tranquil melodies and moderate energy levels, perfect for unwinding with a balanced mix of valence ({cluster_statistics.loc[2, 'valence']:.2f}) and tempo ({cluster_statistics.loc[2, 'normalized_tempo']:.2f}).",
    3: f"Melancholic tunes with low valence ({cluster_statistics.loc[3, 'valence']:.2f}) and a somber atmosphere, often featuring high acousticness ({cluster_statistics.loc[3, 'acousticness']:.2f}).",
}

# Print, display, and bar-chart each cluster's profile.
for cluster_id, name in cluster_names.items():
    print(f"Cluster {cluster_id} ({name}): {cluster_explanations[cluster_id]}")
    display(cluster_statistics.loc[[cluster_id]])
    plt.figure(figsize=(10, 6))
    sns.set(style="white")  # remove grid
    sns.barplot(x=cluster_statistics.columns, y=cluster_statistics.loc[cluster_id].values, palette="magma")
    plt.title(f'Cluster {cluster_id} - {name} Features')
    plt.xticks(rotation=45)
    # Annotate each bar with its value.
    for index, value in enumerate(cluster_statistics.loc[cluster_id]):
        plt.text(index, value, f'{value:.2f}', ha='center', va='bottom')
    plt.show()
    print('\n')
Cluster 0 (Serene Sounds): Songs with serene and calming vibes characterized by high valence (0.61) and moderate tempo (0.56).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 0 | 0.645645 | 0.684244 | 0.611999 | 0.564491 | 0.236267 | 0.056894 |
Cluster 1 (Pulsating Rhythms): Tracks featuring pulsating rhythms and high energy suitable for dancing with high energy (0.15) and tempo (0.49).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 1 | 0.354487 | 0.146981 | 0.2049 | 0.491445 | 0.913782 | 0.045184 |
Cluster 2 (Tranquil Tunes): Music with tranquil melodies and moderate energy levels, perfect for unwinding with a balanced mix of valence (0.58) and tempo (0.58).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 2 | 0.683221 | 0.738124 | 0.579874 | 0.577067 | 0.17778 | 0.267501 |
Cluster 3 (Melancholic Melodies): Melancholic tunes with low valence (0.23) and a somber atmosphere, often featuring high acousticness (0.01).
| danceability | energy | valence | normalized_tempo | acousticness | speechiness | |
|---|---|---|---|---|---|---|
| Cluster | ||||||
| 3 | 0.288293 | 0.904954 | 0.225855 | 0.57095 | 0.014879 | 0.103418 |
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot each cluster's centroid in (normalized tempo, valence) space, with its
# name and explanation annotated at the point.
plt.figure(figsize=(12, 10))
for cid, label in cluster_names.items():
    x = cluster_statistics.loc[cid, 'normalized_tempo']
    y = cluster_statistics.loc[cid, 'valence']
    plt.scatter(x, y, label=label, s=200, alpha=0.7)
    plt.text(x, y, f"{label}\n{cluster_explanations[cid]}",
             fontsize=10, ha='center', va='center', wrap=True)
plt.xlabel('Normalized Tempo', fontsize=12)
plt.ylabel('Valence', fontsize=12)
plt.title('Cluster Analysis based on Tempo and Valence', fontsize=14)
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=2, fontsize=10)
plt.grid(False)
plt.margins(0.05)
plt.show()

# Explanation
print("\nExplanation:")
print("We chose to plot tempo and valence as they are two key features that determine the mood of a song.")
print("Tempo indicates the speed or pace of the music, while valence represents the positivity or negativity of the musical content.")
print("By analyzing these two features, we are able to identify distinct clusters representing songs with different mood characteristics.")
# Conclusion
print("\nConclusion:")
print("Based on the clustering analysis, we identified distinct clusters representing songs with different mood characteristics.")
print("Machine learning can be a valuable tool for creating playlists as it automatically categorizes songs based on their features, helping users discover music that matches their mood and preferences.")
Explanation: We chose to plot tempo and valence as they are two key features that determine the mood of a song. Tempo indicates the speed or pace of the music, while valence represents the positivity or negativity of the musical content. By analyzing these two features, we are able to identify distinct clusters representing songs with different mood characteristics. Conclusion: Based on the clustering analysis, we identified distinct clusters representing songs with different mood characteristics. Machine learning can be a valuable tool for creating playlists as it automatically categorizes songs based on their features, helping users discover music that matches their mood and preferences.
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Bag-of-words over "name artist" text, then one word cloud and one top-10
# bar chart per cluster.
spotify5k_df['text'] = spotify5k_df['name'] + ' ' + spotify5k_df['artist']
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(spotify5k_df['text'])
vocab = vectorizer.get_feature_names_out()

cluster_names = {
    0: 'Relaxing Vibes',
    1: 'Energetic Beats',
    2: 'Chill Out',
    3: 'Melancholic Melodies'
}

# MEMORY FIX: the original densified the whole document-term matrix with
# X.toarray() into a DataFrame (n_songs x vocab floats). Summing the sparse
# rows per cluster yields identical word frequencies without materializing it.
for cluster_id in range(spotify5k_df['Cluster'].nunique()):
    mask = (spotify5k_df['Cluster'] == cluster_id).to_numpy()
    counts = pd.Series(np.asarray(X[mask].sum(axis=0)).ravel(), index=vocab)
    word_freq = counts.to_dict()
    wordcloud = WordCloud(width=800, height=400, background_color='white', prefer_horizontal=0.9).generate_from_frequencies(word_freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {cluster_names.get(cluster_id, "Cluster " + str(cluster_id))}')
    plt.axis('off')
    plt.show()

    # Horizontal bar chart of the ten most frequent words in this cluster.
    top_words = counts.sort_values(ascending=False).head(10)
    sns.set_palette('bright')
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_words.values, y=top_words.index)
    plt.title(f'Top 10 Words in {cluster_names.get(cluster_id, "Cluster " + str(cluster_id))}')
    for side in ('top', 'right', 'left', 'bottom'):
        plt.gca().spines[side].set_visible(False)
    for i, (word, count) in enumerate(zip(top_words.index, top_words.values)):
        plt.text(count, i, f' {word} ({count})', fontsize=10, style='italic', va='center')
    plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist
from warnings import simplefilter
# Ignore future warnings (e.g. the KMeans n_init default-change warning).
simplefilter(action='ignore', category=FutureWarning)

# Select features for clustering
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']

# Scale the features
# NOTE(review): if this except fires, X is never assigned and every later step
# raises NameError anyway — printing and continuing only obscures the failure.
try:
    scaler = StandardScaler()
    X = scaler.fit_transform(spotify5k_df[features])
except KeyError:
    print("Error: Features not found in the dataset.")

# Use PCA to determine the number of components: keep the smallest count that
# explains >= 95% of the variance, then re-project onto those components.
try:
    pca = PCA(n_components=len(features))
    X_pca = pca.fit_transform(X)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
except ValueError:
    print("Error: Not enough features for PCA.")

# Find optimal number of clusters (k in [2, 10]) by maximum silhouette score.
best_score = -1
best_k = -1
for k in range(2, 11):
    try:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_pca)
        silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_k = k
    except ValueError:
        print("Error: Unable to find optimal number of clusters.")

# Train KMeans with the winning k and attach labels to the DataFrame
# (lower-case 'cluster', distinct from the earlier 'Cluster' column).
try:
    kmeans = KMeans(n_clusters=best_k, random_state=42)
    kmeans.fit(X_pca)
    spotify5k_df['cluster'] = kmeans.labels_
except ValueError:
    print("Error: Unable to train KMeans model.")
# Function to calculate diversity of recommended songs
def calculate_diversity(recommended_songs, feature_cols=None):
    """Score how close together a set of songs sits in audio-feature space.

    Computes 1 minus the mean pairwise cosine distance over the songs'
    feature rows. NOTE(review): despite the name, HIGHER values mean the
    songs are MORE similar to each other — this behaves like a mean cosine
    similarity, which matches the near-1.0 values in the notebook output.

    Parameters
    ----------
    recommended_songs : DataFrame containing the feature columns.
    feature_cols : optional list of column names; defaults to the
        module-level `features` list (backward compatible).

    Returns float('nan') for fewer than two songs (pairwise distances are
    undefined; the original produced the same NaN via a mean-of-empty
    RuntimeWarning), or None when the columns are missing (original behavior).
    """
    if feature_cols is None:
        feature_cols = features
    try:
        recommended_features = recommended_songs[feature_cols].to_numpy()
        if len(recommended_features) < 2:
            return float('nan')
        avg_cosine_distance = np.mean(pdist(recommended_features, metric='cosine'))
        return 1 - avg_cosine_distance
    except KeyError:
        print("Error: Features not found in recommended songs.")
# Function to recommend songs from a given cluster
def recommend_songs(cluster_id, num_songs=5, df=None, random_state=None):
    """Randomly sample up to `num_songs` rows whose 'cluster' label matches.

    Parameters
    ----------
    cluster_id : cluster label to draw from.
    num_songs : maximum number of songs returned (capped by cluster size,
        so an over-large request no longer matters).
    df : optional DataFrame; defaults to the module-level spotify5k_df
        (backward compatible with the original signature).
    random_state : optional seed for reproducible sampling; None keeps the
        original nondeterministic behavior.
    """
    if df is None:
        df = spotify5k_df
    try:
        cluster_data = df[df['cluster'] == cluster_id]
        return cluster_data.sample(min(num_songs, len(cluster_data)), random_state=random_state)
    except KeyError:
        print("Error: Cluster ID not found.")
# Visualize clusters
def visualize_clusters(X_pca, labels, centroids):
    """Scatter the 2-D PCA projection coloured by cluster id, marking each
    cluster centroid with a black star."""
    plt.figure(figsize=(10, 6))
    n_found = len(np.unique(labels))
    for cluster_id in range(n_found):
        members = labels == cluster_id
        plt.scatter(X_pca[members, 0], X_pca[members, 1], label=f'Cluster {cluster_id}')
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='black', label='Centroids')
    plt.title('Clusters')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.show()
# Visualize explained variance ratio
def visualize_variance(pca):
    """Plot the cumulative explained-variance curve of an already-fitted PCA."""
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    plt.figure(figsize=(8, 6))
    plt.plot(cumulative)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA Explained Variance Ratio')
    plt.grid(True)
    plt.show()
# Visualize silhouette scores
def visualize_silhouette_scores(scores):
    """Plot silhouette score against the number of clusters.

    FIX: the x-axis is derived from len(scores) (starting at k=2) instead of
    a hard-coded range(2, 11), so the plot no longer raises a length-mismatch
    error when the sweep covers a different number of k values.
    """
    ks = range(2, 2 + len(scores))
    plt.figure(figsize=(8, 6))
    plt.plot(ks, scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score vs Number of Clusters')
    plt.grid(True)
    plt.show()
# Example: recommend songs from each cluster and report a diversity score
# for every generated playlist.
try:
    print("**Playlist Recommendations and Diversity Scores:**")
    playlists = []
    for cid in range(best_k):
        picks = recommend_songs(cid)
        title = f"Playlist {cid + 1}: {', '.join(picks['name'].tolist())}"
        playlists.append({'Name': title, 'Diversity': calculate_diversity(picks)})
    playlists_df = pd.DataFrame(playlists)
    display(playlists_df)
except TypeError:
    print("Error: Unable to calculate diversity.")
# Visualizations
print("\n**Visualizations:**")
visualize_clusters(X_pca, kmeans.labels_, kmeans.cluster_centers_)
visualize_variance(pca)
# BUG FIX: the original comprehension ignored its loop variable and evaluated
# silhouette_score on the SAME fitted model nine times, plotting a flat line.
# Recompute the score for each candidate k instead.
sweep_scores = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca)
    sweep_scores.append(silhouette_score(X_pca, km.labels_))
visualize_silhouette_scores(sweep_scores)
# Answers to questions — emitted as a fixed Q&A transcript (output is
# byte-identical to printing each line individually).
qa_lines = [
    "\n**Answers to Questions:**",
    "\n**How did you create your prototype?**",
    "The prototype was created using Python with the scikit-learn library for machine learning algorithms.\n",
    "**How many playlists (clusters) are there?**",
    "The number of playlists (clusters) is determined dynamically based on the data using the silhouette score.\n",
    "**What audio features did you use and what did you drop? Why?**",
    "We used features like danceability, energy, loudness, etc., as they are relevant to song characteristics.",
    "We dropped features that were not considered to significantly influence playlist creation, such as 'duration_ms' or 'time_signature'.\n",
    "**Is the prototype effective at creating cohesive playlists?**",
    "Cohesiveness of playlists can be evaluated based on diversity and user feedback.\n",
    "**Are Spotify’s audio features capable of identifying 'similar songs' as defined by humanly detectable criteria?**",
    "This can be determined through user feedback and comparison with manually curated playlists.\n",
    "**What kind of data might help us create better playlists?**",
    "Additional data such as user preferences, listening history, genre information, etc., can improve playlist quality.\n",
    "**Is K-Means a good method for creating playlists? Provide pros and cons.**",
    "Pros:",
    "- Simple and easy to implement.",
    "- Scalable to large datasets.",
    "\nCons:",
    "- Assumes clusters are spherical and of equal size.",
    "- Sensitive to initialization.\n",
    "**What would be your next steps if you continued with this project?**",
    "Further refinement of clustering algorithms, incorporation of user feedback, and integration with a music streaming platform for real-time playlist generation.",
]
for line in qa_lines:
    print(line)
**Playlist Recommendations and Diversity Scores:**
| Name | Diversity | |
|---|---|---|
| 0 | Playlist 1: Sexy Movimiento ... | 0.998082 |
| 1 | Playlist 2: Tea for Two ... | 0.984093 |
**Visualizations:**
**Answers to Questions:** **How did you create your prototype?** The prototype was created using Python with the scikit-learn library for machine learning algorithms. **How many playlists (clusters) are there?** The number of playlists (clusters) is determined dynamically based on the data using the silhouette score. **What audio features did you use and what did you drop? Why?** We used features like danceability, energy, loudness, etc., as they are relevant to song characteristics. We dropped features that were not considered to significantly influence playlist creation, such as 'duration_ms' or 'time_signature'. **Is the prototype effective at creating cohesive playlists?** Cohesiveness of playlists can be evaluated based on diversity and user feedback. **Are Spotify’s audio features capable of identifying 'similar songs' as defined by humanly detectable criteria?** This can be determined through user feedback and comparison with manually curated playlists. **What kind of data might help us create better playlists?** Additional data such as user preferences, listening history, genre information, etc., can improve playlist quality. **Is K-Means a good method for creating playlists? Provide pros and cons.** Pros: - Simple and easy to implement. - Scalable to large datasets. Cons: - Assumes clusters are spherical and of equal size. - Sensitive to initialization. **What would be your next steps if you continued with this project?** Further refinement of clustering algorithms, incorporation of user feedback, and integration with a music streaming platform for real-time playlist generation.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from IPython.display import display
# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Score each track name with VADER's compound polarity in [-1, 1];
# fillna('') guards against missing names.
spotify5k_df['sentiment_score'] = spotify5k_df['name'].fillna('').apply(lambda x: sia.polarity_scores(x)['compound'])

# Fit a TF-IDF Vectorizer on the raw track names (sparse matrix X).
# NOTE(review): unlike the sentiment line above, this does not fillna —
# assumes 'name' has no NaNs at this point; confirm.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(spotify5k_df['name'])

# Sweep k = 2..10 and keep the k with the best silhouette score.
silhouette_scores = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_scores.append(silhouette_score(X, cluster_labels))
optimal_clusters_silhouette = silhouette_scores.index(max(silhouette_scores)) + 2
# Apply PCA to visualize clusters (densifies the TF-IDF matrix; 2 components)
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Find "optimal" number of clusters using PCA
# NOTE(review): inertia decreases monotonically with k, so index(min(inertia))
# always selects the LARGEST k tried (10) — this is not an elbow criterion,
# and the printed output below confirms it picked 10. Consider a knee/elbow
# detector or a silhouette sweep on X_pca instead.
inertia = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_pca)
    inertia.append(kmeans.inertia_)
optimal_clusters_pca = inertia.index(min(inertia)) + 2
# Cluster using KMeans with optimal number of clusters
kmeans_silhouette = KMeans(n_clusters=optimal_clusters_silhouette, random_state=42)
spotify5k_df['cluster_silhouette'] = kmeans_silhouette.fit_predict(X)
# NOTE(review): despite the name, this also fits on the TF-IDF matrix X,
# not on X_pca — confirm which space was intended.
kmeans_pca = KMeans(n_clusters=optimal_clusters_pca, random_state=42)
spotify5k_df['cluster_pca'] = kmeans_pca.fit_predict(X)

# Define cluster names based on sentiment
# NOTE(review): only labels 0-2 have names; with ten clusters selected above,
# labels 3-9 map to NaN and are silently dropped by the later dropna (the
# displayed output keeps 621 of 5235 rows). KMeans label ids also carry no
# sentiment ordering, so these names are not grounded in sentiment_score.
sentiment_cluster_names = {
    0: 'Negative (Low Sentiment)',
    1: 'Neutral (Medium Sentiment)',
    2: 'Positive (High Sentiment)'
}

# Assign cluster names
spotify5k_df['cluster_silhouette'] = spotify5k_df['cluster_silhouette'].map(sentiment_cluster_names)
spotify5k_df['cluster_pca'] = spotify5k_df['cluster_pca'].map(sentiment_cluster_names)

# Explanation for choosing the number of clusters
explanation = f"The number of clusters chosen based on silhouette score: {optimal_clusters_silhouette}. " \
              f"The number of clusters chosen based on PCA: {optimal_clusters_pca}."
# Visualize clusters using PCA (2-D scatter coloured by the named pca clusters)
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=spotify5k_df['cluster_pca'], palette='husl', legend='full', marker='o')
plt.title('PCA Visualization of Song Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.grid(False)
plt.show()

# Visualize clusters using Silhouette Score (k sweep from the earlier cell)
plt.figure(figsize=(10, 6))
sns.lineplot(x=range(2, 11), y=silhouette_scores, marker='o', color='blue')
plt.title('Silhouette Score for Optimal Cluster Selection')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(False)
plt.xticks(range(2, 11))
plt.show()

# Additional visualizations: distribution of VADER compound scores
plt.figure(figsize=(10, 6))
sns.histplot(data=spotify5k_df, x='sentiment_score', bins=30, kde=True, color='green')
plt.title('Distribution of Sentiment Scores')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()

# Count of songs per named pca cluster (NaN-labelled rows are omitted by seaborn)
plt.figure(figsize=(10, 6))
sns.countplot(data=spotify5k_df, x='cluster_pca', palette='husl')
plt.title('Number of Songs in Each Sentiment Cluster')
plt.xlabel('Sentiment Cluster')
plt.ylabel('Count')
plt.grid(False)
plt.show()

# Output DataFrame with cluster assignments and remove NaNs
# (rows whose label had no entry in the 0-2 name map drop out here)
cluster_silhouette_df = spotify5k_df[['name', 'cluster_silhouette']].dropna()
cluster_pca_df = spotify5k_df[['name', 'cluster_pca']].dropna()
# Mean sentiment score for each named cluster
sentiment_scores = spotify5k_df.groupby('cluster_pca')['sentiment_score'].mean()
# Explanation for cluster names
# NOTE(review): the displayed means contradict these names — the "Positive"
# cluster shows the LOWEST mean score in the output; verify the mapping.
cluster_name_explanation = f"Cluster names are based on sentiment score: " \
                           f"Negative (Low Sentiment): sentiment score < 0, " \
                           f"Neutral (Medium Sentiment): sentiment score ≈ 0, " \
                           f"Positive (High Sentiment): sentiment score > 0."
# Display the outputs
print(explanation)
print(cluster_name_explanation)
display(cluster_silhouette_df)
display(cluster_pca_df)
display(sentiment_scores)
The number of clusters chosen based on silhouette score: 10. The number of clusters chosen based on PCA: 10. Cluster names are based on sentiment score: Negative (Low Sentiment): sentiment score < 0, Neutral (Medium Sentiment): sentiment score ≈ 0, Positive (High Sentiment): sentiment score > 0.
| name | cluster_silhouette | |
|---|---|---|
| 10 | The Girl From Ipanema ... | Positive (High Sentiment) |
| 18 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 21 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 23 | Don't Stop the Carnival ... | Positive (High Sentiment) |
| 28 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| ... | ... | ... |
| 5197 | The Unanswered Question ... | Positive (High Sentiment) |
| 5201 | The Planets - Suite for large orchestra, Op.32... | Positive (High Sentiment) |
| 5213 | The Nutcracker, Op.71, TH.14 / Act 1: No. 2 Ma... | Positive (High Sentiment) |
| 5226 | Pines Of Rome, P. 141: 3. The Pines Of The Jan... | Positive (High Sentiment) |
| 5229 | A Flock Descends Into The Pentagonal Garden ... | Positive (High Sentiment) |
621 rows × 2 columns
| name | cluster_pca | |
|---|---|---|
| 10 | The Girl From Ipanema ... | Positive (High Sentiment) |
| 18 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 21 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| 23 | Don't Stop the Carnival ... | Positive (High Sentiment) |
| 28 | Aquarela Do Brasil ... | Negative (Low Sentiment) |
| ... | ... | ... |
| 5197 | The Unanswered Question ... | Positive (High Sentiment) |
| 5201 | The Planets - Suite for large orchestra, Op.32... | Positive (High Sentiment) |
| 5213 | The Nutcracker, Op.71, TH.14 / Act 1: No. 2 Ma... | Positive (High Sentiment) |
| 5226 | Pines Of Rome, P. 141: 3. The Pines Of The Jan... | Positive (High Sentiment) |
| 5229 | A Flock Descends Into The Pentagonal Garden ... | Positive (High Sentiment) |
621 rows × 2 columns
cluster_pca Negative (Low Sentiment) 0.114416 Neutral (Medium Sentiment) 0.072395 Positive (High Sentiment) -0.044409 Name: sentiment_score, dtype: float64
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from IPython.display import display

# Initialize the VADER sentiment analyzer.
# NOTE(review): VADER is designed for natural-language text; running it on a
# stringified row of numeric audio features yields mostly-meaningless scores.
# Kept for comparison purposes, but treat 'objective_sentiment_score' with caution.
sia = SentimentIntensityAnalyzer()

# Numeric audio-feature columns used both for the "objective" sentiment score
# and for KMeans clustering below.
objective_columns = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

# Per-ROW sentiment score. BUGFIX: the original call omitted axis=1, so the
# lambda ran once per COLUMN and the resulting Series (indexed by column names)
# aligned to nothing on assignment, leaving the new column all-NaN.
# axis=1 passes each row to the lambda as intended.
spotify5k_df['objective_sentiment_score'] = spotify5k_df[objective_columns].fillna('').apply(
    lambda row: sia.polarity_scores(str(row))['compound'], axis=1)

# Fit a 2-component PCA purely for visualizing the clusters in 2D.
X_objective = spotify5k_df[objective_columns].fillna(0)  # Fill NaNs with 0 for PCA
pca_objective = PCA(n_components=2, random_state=42)
X_pca_objective = pca_objective.fit_transform(X_objective)

# Cluster the raw (unscaled) features into 3 groups.
# n_init=10 pins the legacy default explicitly, matching the other cells in
# this notebook and avoiding the sklearn FutureWarning.
kmeans_objective = KMeans(n_clusters=3, random_state=42, n_init=10)
spotify5k_df['objective_cluster'] = kmeans_objective.fit_predict(X_objective)

# Human-readable names for the three clusters.
# NOTE(review): KMeans label ids (0/1/2) are arbitrary — they carry no inherent
# Low/Medium/High ordering. A robust mapping would rank clusters by a centroid
# statistic first; verify this mapping before interpreting the names.
objective_cluster_names = {
    0: 'Low',
    1: 'Medium',
    2: 'High'
}
# Replace numeric labels with the names above.
spotify5k_df['objective_cluster'] = spotify5k_df['objective_cluster'].map(objective_cluster_names)

# Explanation text shown alongside the outputs.
objective_explanation = "Objective sentiment analysis was performed based on the following columns: " \
                        "'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', " \
                        "'acousticness', 'instrumentalness', 'liveness', 'valence', and 'tempo'. " \
                        "Three clusters were chosen for better interpretation: Low, Medium, and High."

# Scatter the 2D PCA projection, colored by cluster name.
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca_objective[:, 0], y=X_pca_objective[:, 1],
                hue=spotify5k_df['objective_cluster'], palette='husl',
                legend='full', marker='o')
plt.title('PCA Visualization of Objective Sentiment Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.grid(False)
plt.show()

# Cross-tabulate against the name-based sentiment clusters ('cluster_pca',
# computed in an earlier cell) to compare the two labelings.
plt.figure(figsize=(10, 6))
sns.countplot(data=spotify5k_df, x='cluster_pca', hue='objective_cluster', palette='husl')
plt.title('Comparison of Sentiment Clusters (Name vs Objective Columns)')
plt.xlabel('Sentiment Cluster (Name)')
plt.ylabel('Count')
plt.legend(title='Objective Cluster', loc='upper right')
plt.grid(False)
plt.show()

# Display the explanation and the resulting per-song cluster assignments.
display(objective_explanation)
display(spotify5k_df[['name', 'objective_cluster']])
"Objective sentiment analysis was performed based on the following columns: 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', and 'tempo'. Three clusters were chosen for better interpretation: Low, Medium, and High."
| name | objective_cluster | |
|---|---|---|
| 0 | Se Eu Quiser Falar Com Deus ... | Medium |
| 1 | Saudade De Bahia ... | Medium |
| 2 | Canta Canta, Minha Gente ... | Low |
| 3 | Mulher Eu Sei ... | Low |
| 4 | Rosa Morena ... | Medium |
| ... | ... | ... |
| 5230 | 1812 Festival Overture, Op. 49: 1812 Overture,... | Low |
| 5231 | Winter Fragments pour ensemble instrumental, s... | Low |
| 5232 | Schoenberg: 5 Orchestral Pieces, Op. 16: No. 3... | Low |
| 5233 | Serenade For Strings In E, Op.22, B. 52: 1. Mo... | Low |
| 5234 | Ravel: Boléro, M. 81 ... | Low |
5235 rows × 2 columns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Audio features used for clustering (excludes the categorical-ish 'key',
# 'mode', 'time_signature' columns).
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']
# Explanation: These features are chosen because they represent different aspects of songs
print("Selected Features for Clustering:")
print(pd.DataFrame(features, columns=['Features']))

# Standardize to zero mean / unit variance so no feature (e.g. tempo, with a
# much larger raw range) dominates the Euclidean distances used by KMeans.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(spotify5k_df[features])
# Explanation: StandardScaler is used to standardize the features, ensuring each feature has a mean of 0 and a standard deviation of 1.
print("\nFeature Scaling:")
print("The features are scaled using StandardScaler to standardize the data, making it suitable for clustering.")

# DataFrame view of the scaled matrix, used only for plotting histograms.
scaled_df = pd.DataFrame(X_scaled, columns=features)

# One histogram per scaled feature (3x3 grid).
plt.figure(figsize=(12, 8))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    plt.hist(scaled_df[feature], bins=20, color='skyblue', edgecolor='black')
    plt.title(feature)
    plt.xlabel('Scaled Values')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# PCA with n_components == number of features keeps ALL variance; this step is
# a rotation (useful for the variance-ratio plot below), not an actual
# dimensionality reduction — choose a smaller n_components to truly reduce.
pca = PCA(n_components=len(features))
X_pca = pca.fit_transform(X_scaled)
# Explanation: PCA is used to reduce the dimensionality of the data while retaining most of its variance.
print("\nDimensionality Reduction with PCA:")
print("Principal Component Analysis (PCA) is applied to reduce the dimensionality of the data while retaining most of its variance.")

# Scree-style plot: variance explained by each principal component.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(features) + 1), pca.explained_variance_ratio_, marker='o', linestyle='--', color='b')
plt.title('Explained Variance Ratio by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.xticks(np.arange(1, len(features) + 1))
plt.grid(False)
plt.show()

# Silhouette score for k = 2..100. n_init=10 pins the legacy sklearn default
# explicitly (consistent with the other cells; silences the FutureWarning).
# NOTE: 99 KMeans fits + silhouette evaluations on ~5k rows is slow by design.
silhouette_scores = []
for k in range(2, 101):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_pca)
    silhouette_avg = silhouette_score(X_pca, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette score as a function of k.
plt.figure(figsize=(10, 6))
plt.plot(range(2, 101), silhouette_scores, marker='o', linestyle='-', color='r')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(np.arange(2, 101, 5))
plt.grid(False)
plt.show()

# Based on business requirements, choose the number of clusters
# We aim to create playlists with sizes between 50 and 250 songs
# So, let's explore having between 20 and 100 clusters
print("\nChoosing Number of Clusters:")
print("Based on business requirements, we aim to create playlists with sizes between 50 and 250 songs.")
print("We will explore creating between 20 and 100 clusters to ensure playlist sizes between 50 and 250 songs.")
Selected Features for Clustering:
Features
0 danceability
1 energy
2 loudness
3 speechiness
4 acousticness
5 instrumentalness
6 liveness
7 valence
8 tempo
Feature Scaling:
The features are scaled using StandardScaler to standardize the data, making it suitable for clustering.
Dimensionality Reduction with PCA: Principal Component Analysis (PCA) is applied to reduce the dimensionality of the data while retaining most of its variance.
Choosing Number of Clusters: Based on business requirements, we aim to create playlists with sizes between 50 and 250 songs. We will explore creating between 20 and 100 clusters to ensure playlist sizes between 50 and 250 songs.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, PowerTransformer

# Audio features to cluster on.
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
# Extract features from the dataframe
X = spotify5k_df[features]
# Number of clusters
k = 4

# Labels for each preprocessing option, aligned 1:1 with the scaler instances
# iterated below. BUGFIX: the first entry was 'Raw' while the loop hard-coded
# 'Raw Data' in its else-branch, leaving the list entry dead and inconsistent;
# the list now holds the actual label and is indexed uniformly.
scaler_names = ['Raw Data', 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'PowerTransformer']

# NOTE(review): WCSS values are NOT comparable across different scalers —
# inertia is measured in the (scaled) feature units, so scalers that squeeze
# data into [0,1] (MinMax, MaxAbs) mechanically produce the smallest WCSS.
# Picking "the best scaler" by min WCSS is therefore misleading; compare with a
# scale-free criterion (e.g. silhouette score) instead.
wcss_dict = {}

# Consistent colors for the radar charts.
colors = sns.color_palette("tab10")
# Cluster centers per scaler, kept for later inspection.
cluster_centers = {}

# Fit KMeans on the raw data and on each scaled variant.
for i, scaler in enumerate([None, StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler(), PowerTransformer()]):
    # Apply the scaler when present; None means unscaled input.
    if scaler is not None:
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = X
    scaler_name = scaler_names[i]

    # Fit KMeans (n_init=10 pins the legacy default; deterministic via random_state).
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)

    # Record WCSS (inertia) and the centroids for this preprocessing choice.
    wcss = kmeans.inertia_
    wcss_dict[scaler_name] = wcss
    cluster_centers[scaler_name] = kmeans.cluster_centers_

    # Radar chart of the k centroids in the (scaled) feature space.
    plt.figure(figsize=(10, 6))
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False).tolist()
    angles += angles[:1]  # Ensure closed loop
    ax = plt.subplot(111, polar=True)
    # Plot each cluster's centroid
    for idx, center in enumerate(kmeans.cluster_centers_):
        values = np.concatenate((center, [center[0]]))
        ax.plot(angles, values, marker='o', linestyle='-', color=colors[idx], linewidth=2, label=f'Cluster {idx+1}')
        ax.fill(angles, values, alpha=0.25)
    # Set the labels for each axis
    ax.set_yticklabels([])
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features, fontsize=10)
    plt.title(f'Radar Chart for Cluster Centers ({scaler_name})', loc='left', fontsize=12, pad=20)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=k)
    plt.show()

    # Cumulative WCSS table after each scaler (grows one row per iteration).
    scaler_df = pd.DataFrame(list(wcss_dict.items()), columns=['Scaler', 'WCSS'])
    print(f"{scaler_name} chart")
    print(f"Table for {scaler_name}:")
    display(scaler_df)

# Scaler with the minimum WCSS — see the caveat above: this systematically
# favors scalers that shrink the feature range, not better clusterings.
best_scaler = min(wcss_dict, key=wcss_dict.get)

# Print the best scaler and its impact on creating clusters
print(f"\nBased on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is {best_scaler}.")
print("Using this scaler helps in minimizing the WCSS, indicating better cluster formation.")
print("Choosing an appropriate scaler is crucial as it affects the clustering results.")
print(f"With {k} clusters, it is recommended to use the {best_scaler} scaler to create the clusters.")
Raw Data chart Table for Raw Data:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
StandardScaler chart Table for StandardScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
MinMaxScaler chart Table for MinMaxScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
RobustScaler chart Table for RobustScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
MaxAbsScaler chart Table for MaxAbsScaler:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
| 4 | MaxAbsScaler | 999.866939 |
PowerTransformer chart Table for PowerTransformer:
| Scaler | WCSS | |
|---|---|---|
| 0 | Raw Data | 598113.363521 |
| 1 | StandardScaler | 24522.213055 |
| 2 | MinMaxScaler | 998.931204 |
| 3 | RobustScaler | 16081.887091 |
| 4 | MaxAbsScaler | 999.866939 |
| 5 | PowerTransformer | 24975.059530 |
Based on the Within-Cluster-Sum-of-Squares (WCSS), the best scaler to use is MinMaxScaler. Using this scaler helps in minimizing the WCSS, indicating better cluster formation. Choosing an appropriate scaler is crucial as it affects the clustering results. With 4 clusters, it is recommended to use the MinMaxScaler scaler to create the clusters.